
# Load Libraries
library(tidyr)
library(dplyr)
library(digest)

# Load data
# Comma-separated export; the first line of the file supplies the column names.
# Arguments are spelled out in full (`header`, not the partially matched `head`)
# and use TRUE rather than the reassignable shorthand T.
mydata <- read.csv(file = "data/study_1/_raw_data.csv", header = TRUE, sep = ",")

# Extract interesting (+ descriptive) columns
# Names of the raw columns to keep, in the order they are later renamed.
cols <- local({
  # Five fields that repeat with an "X1_" ... "X10_" prefix in the raw export.
  per_trial <- c("QID692", "QID695_1", "QID694_FIRST_CLICK",
                 "QID694_LAST_CLICK", "QID694_CLICK_COUNT")
  # 1,1,1,1,1,2,2,... so all five fields of a prefix stay adjacent.
  trial <- rep(1:10, each = length(per_trial))

  c(
    "ipAddress", "X_recordId", "QID1_TEXT", "QID493_TEXT",
    paste0("X", trial, "_", per_trial),
    "QID351_TEXT", "QID698", "QID727", "QID750_TEXT",
    "QID696", "QID699", "QID700", "QID23",
    "QID7_TEXT", "QID539", "QID8", "QID9", "QID10",
    paste0("imgurl", 0:9),
    "condition", "webCounter"
  )
})

# Keep only the columns listed in `cols`, in that order.
mydata2 <- mydata[cols]

# Rename columns
# The new names are assigned positionally, so their order must mirror `cols`:
# four identifying columns, five measures for each of image slots 0-9, the
# survey answers, the ten image columns, then Condition and WebCounter.
names(mydata2) <- local({
  measures <- c("Esc", "Conf", "Time_Esc", "Time_Conf", "No_Clicks")
  # 0,0,0,0,0,1,1,... so the five measures of one image slot stay adjacent.
  slot <- rep(0:9, each = length(measures))

  c(
    "IP", "RowID", "TurkerID", "Transcription",
    paste0(measures, slot),
    "Description", "Estimate", "Factors", "OtherFactors",
    "Familiar", "Played", "Escaped", "Participated",
    "Age", "Gender", "Education", "Income", "Political",
    paste0("Img", 0:9),
    "Condition", "WebCounter"
  )
})

# Remove first row (it's just another heading)
mydata3 <- mydata2[-1, ]

# Remove duplicate turkers
# Every row whose TurkerID occurs more than once is dropped (no copy is kept).
# The filter runs on mydata3 -- not mydata2 -- so the heading row removed above
# does not get reintroduced.
repeated_id <- duplicated(mydata3$TurkerID) |
  duplicated(mydata3$TurkerID, fromLast = TRUE)
mydata3 <- mydata3[!repeated_id, ]

# Extract records that answered all questions
# An NA comparison would otherwise index in an all-NA row, so NA counts as
# "not answered".
answered <- !is.na(mydata3$Political) & mydata3$Political != ""
mydata4 <- mydata3[answered, ]

# Remove records that did not transcribe properly
# Transcriptions shorter than 160 characters (or missing) are rejected.
transcription_len <- nchar(as.character(mydata4[, "Transcription"]))
mydata5 <- mydata4[!is.na(transcription_len) & transcription_len >= 160, ]

# Check for balance
# table() reports per-condition counts whether Condition is a factor or a
# character column; summary() only does so for factors.
table(mydata5$Condition)

# Extract only Image assessment columns
# TurkerID, then the five measures for each of image slots 0-9, then the ten
# image columns, Condition and WebCounter (63 names in total).
imgCols <- local({
  measures <- c("Esc", "Conf", "Time_Esc", "Time_Conf", "No_Clicks")
  # 0,0,0,0,0,1,1,... so the five measures of one image slot stay adjacent.
  slot <- rep(0:9, each = length(measures))

  c(
    "TurkerID",
    paste0(measures, slot),
    paste0("Img", 0:9),
    "Condition", "WebCounter"
  )
})

mydata.img <- mydata5[, imgCols]

# Shorten the answer labels to "Y"/"N" and reduce image URLs to bare file
# stems. The substitutions are applied to every column, as text.
#  - fixed = TRUE treats each pattern as a literal string; as a regular
#    expression the "." in ".jpg" and in the URL would match any character.
#  - lapply() processes the frame column by column, so a single-row frame is
#    not collapsed into a vector the way the sapply() matrix route would.
mydata.img <- as.data.frame(lapply(mydata.img, function(x) {
  x <- as.character(x)
  x <- gsub("YES, they escaped the room", "Y", x, fixed = TRUE)
  x <- gsub("NO, they did NOT escape the room", "N", x, fixed = TRUE)
  x <- gsub("http://web.media.mit.edu/~awad/photos_blurred/", "", x, fixed = TRUE)
  gsub(".jpg", "", x, fixed = TRUE)
}))


# Converting factors

# Convert a vector to numeric via its printed labels. Going through
# as.character() matters for factors: as.numeric() on a factor alone would
# return the internal level codes instead of the labelled values.
asNumeric <- function(x) as.numeric(as.character(x))

# Return data frame `d` with every factor column converted to numeric.
# Non-factor columns (including character columns) are left untouched.
# Columns are selected with list-style indexing (d[mask]) so the selection
# stays a data frame even when exactly one column is a factor; matrix-style
# d[, mask] would drop to a bare vector in that case and nothing would be
# converted.
factorsNumeric <- function(d) {
  is_fac <- vapply(d, is.factor, logical(1))
  if (any(is_fac)) {
    d[is_fac] <- lapply(d[is_fac], asNumeric)
  }
  d
}

# Convert a vector to character (thin wrapper kept for symmetry with the
# numeric converter).
asCharacter <- function(x) as.character(x)

# Return data frame `d` with every factor column converted to character.
# Non-factor columns are left untouched. Columns are selected with list-style
# indexing (d[mask]) so the selection stays a data frame even when exactly one
# column is a factor; matrix-style d[, mask] would drop to a bare vector in
# that case and nothing would be converted.
factorsCharacter <- function(d) {
  is_fac <- vapply(d, is.factor, logical(1))
  if (any(is_fac)) {
    d[is_fac] <- lapply(d[is_fac], asCharacter)
  }
  d
}

# Gathering
# For each measure: take a wide table (TurkerID, the measure's ten per-image
# columns, Condition, WebCounter), convert the ten value columns where needed,
# and stack it into long form. The long tables are then bound together and
# spread so that each measure becomes its own column again.

# Stack a wide per-measure table into long form. `key_regex` splits the
# original column name (e.g. "Esc3") into Question ("Esc") and ImgOrder ("3").
stack_by_image <- function(wide, key_regex) {
  long <- gather(wide, Key, Answer, -TurkerID, -Condition, -WebCounter)
  extract(long, Key, c("Question", "ImgOrder"), key_regex)
}

# In mydata.img the five measures of one image slot sit side by side, so the
# same measure recurs every 5 columns; these are the offsets for slots 0-9.
slot_offset <- seq(0, 45, by = 5)
# Positions of the ten value columns inside each wide per-measure table.
value_cols <- 2:11

mydata.img.Esc <- mydata.img[, c(1, 2 + slot_offset, 62:63)]
mydata.img.Esc2 <- stack_by_image(mydata.img.Esc, "(Esc)([[:digit:]])")

mydata.img.Conf <- mydata.img[, c(1, 3 + slot_offset, 62:63)]
mydata.img.Conf[, value_cols] <- factorsNumeric(mydata.img.Conf[, value_cols])
mydata.img.Conf2 <- stack_by_image(mydata.img.Conf, "(Conf)([[:digit:]])")

mydata.img.TimeEsc <- mydata.img[, c(1, 4 + slot_offset, 62:63)]
mydata.img.TimeEsc[, value_cols] <- factorsNumeric(mydata.img.TimeEsc[, value_cols])
mydata.img.TimeEsc2 <- stack_by_image(mydata.img.TimeEsc, "(Time_Esc)([[:digit:]])")

mydata.img.TimeConf <- mydata.img[, c(1, 5 + slot_offset, 62:63)]
mydata.img.TimeConf[, value_cols] <- factorsNumeric(mydata.img.TimeConf[, value_cols])
mydata.img.TimeConf2 <- stack_by_image(mydata.img.TimeConf, "(Time_Conf)([[:digit:]])")

mydata.img.Clicks <- mydata.img[, c(1, 6 + slot_offset, 62:63)]
mydata.img.Clicks[, value_cols] <- factorsNumeric(mydata.img.Clicks[, value_cols])
mydata.img.Clicks2 <- stack_by_image(mydata.img.Clicks, "(No_Clicks)([[:digit:]])")

# The image columns are contiguous (52-61) rather than interleaved.
mydata.img.Img <- mydata.img[, c(1, 52:61, 62:63)]
mydata.img.Img[, value_cols] <- factorsCharacter(mydata.img.Img[, value_cols])
mydata.img.Img2 <- stack_by_image(mydata.img.Img, "(Img)([[:digit:]])")

# One long table holding all measures.
mydata.img.All <- rbind(
  mydata.img.Esc2,
  mydata.img.Conf2,
  mydata.img.TimeEsc2,
  mydata.img.TimeConf2,
  mydata.img.Clicks2,
  mydata.img.Img2
)

# Back to one column per Question value.
mydata.img.final <- spread(mydata.img.All, Question, Answer)

# select only the relevant columns
mydata.img.final <- mydata.img.final[, c("TurkerID", "Condition", "Img", "Esc")]

# anonymize Turker ID
# Each ID is hashed as a plain string. as.character() keeps the hash
# independent of factor levels (a factor element would be hashed together with
# its full level set), and USE.NAMES = FALSE stops the raw IDs from being
# attached to the hashed column as names.
# NOTE(review): the hash is unsalted, so anyone holding a list of candidate IDs
# can recompute it -- confirm this level of anonymization is sufficient.
mydata.img.final$TurkerID <- vapply(
  as.character(mydata.img.final$TurkerID),
  digest,
  character(1),
  USE.NAMES = FALSE
)

# OUTPUT CSV
write.csv(mydata.img.final, file = "data/study_1/responses_per_image.csv", row.names = FALSE)

# responses factors CSV
# One row per retained participant: hashed TurkerID plus the Factors answer.
mydata.factors <- mydata5[, c("TurkerID", "Factors")]

# Hash each ID as a plain string. as.character() keeps the hash independent of
# factor levels (a factor element would be hashed together with its full level
# set), and USE.NAMES = FALSE stops the raw IDs from being attached to the
# hashed column as names.
mydata.factors$TurkerID <- vapply(
  as.character(mydata.factors$TurkerID),
  digest,
  character(1),
  USE.NAMES = FALSE
)

write.csv(mydata.factors, file = "data/study_1/responses_factors.csv", row.names = FALSE)

# END